library(tidyverse)
library(ggplot2)
library(broom)
library(readxl)
library(tidytext)
library(dplyr)
library(tm)
library(SnowballC)
library(lubridate)
library(plotly)
library(ggpmisc)
# ---- Load raw inputs ----
# The three scraped Excel sheets have blank header cells, so readxl
# autogenerates the column names "...2".."...5" (console echo kept
# below); they are renamed during tidying further down.
Palestine_news_articles <- read_xlsx("../data/Palestine_news_articles.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Israel_news_articles <- read_xlsx("../data/israel_news_articles.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Gaza_news_articles <- read_xlsx("../data/gaza_news_articles.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
# Synthetic headlines generated with ChatGPT, compared against real ones later.
chat_gpt_article_headlines <- read_xlsx("../data/chat_gpt_data (1).xlsx")
# Extra scraped rows: headline, description, date (all read as character).
additional_news_data <- read_csv("../data/news_data.csv")
## Rows: 3338 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): headline, description, date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# ABC News "A Million Headlines" corpus: headline text plus a yyyymmdd
# numeric publish date.
million_news <- read_csv("../data/abcnews-date-text 2 (1).csv")
## Rows: 1244184 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): headline_text
## dbl (1): publish_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean one raw scraped sheet. The real columns arrive unnamed (readxl
# autogenerates "...2".."...5"), repeated header rows were scraped as
# data, and some journal cells are "#"-prefixed scraping artifacts.
#
# @param raw_df tibble from read_xlsx() on a scraped article sheet
# @param kw     search-term label recorded in the `keyword` column
# @return tibble with columns title, journal, date, description, keyword
tidy_articles <- function(raw_df, kw) {
  raw_df %>%
    select("...2", "...3", "...4", "...5") %>%
    rename("title" = "...2",
           "journal" = "...3",
           "date" = "...4",
           "description" = "...5") %>%
    filter(!is.na(title)) %>%
    # drop the header rows that were scraped as data
    filter(title != "Title",
           journal != "Journal",
           date != "Date",
           description != "Description") %>%
    # drop rows whose journal field is a "#"-prefixed artifact
    filter(!str_starts(journal, "#")) %>%
    mutate(keyword = kw) %>%
    distinct()
}
# This pipeline was previously pasted three times verbatim; one helper
# keeps the cleaning rules in a single place.
tidy_pna <- tidy_articles(Palestine_news_articles, "palestine")
tidy_ina <- tidy_articles(Israel_news_articles, "israel")
tidy_gna <- tidy_articles(Gaza_news_articles, "gaza")
# Stack the three tidied scrapes into one frame. The join keys are
# spelled out (matching the natural-join message that was previously
# echoed) so the behavior is explicit and silent.
pidf <- full_join(tidy_pna, tidy_ina,
                  by = c("title", "journal", "date", "description", "keyword"))
all_data <- full_join(pidf, tidy_gna,
                      by = c("title", "journal", "date", "description", "keyword"))
# Ten most frequent journals in the Palestine scrape.
tidy_pna %>%
  count(journal) %>%
  slice_max(n, n = 10) %>%   # top_n() is superseded by slice_max()
  ggplot(aes(x = reorder(journal, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Journal", y = "Frequency") +
  theme_minimal()
# Ten most frequent journals in the Israel scrape.
tidy_ina %>%
  count(journal) %>%
  slice_max(n, n = 10) %>%
  ggplot(aes(x = reorder(journal, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Journal", y = "Frequency") +
  theme_minimal()
# Count how often each (title, journal, description) triple occurs in the
# combined palestine+israel frame; a count of 2 means the same article
# was scraped under both keywords.
pidf %>%
group_by(title, journal, description) %>%
summarize(count_of_string = n()) #%>%
## `summarise()` has grouped output by 'title', 'journal'. You can override using
## the `.groups` argument.
## # A tibble: 1,004 × 4
## # Groups: title, journal [983]
## title journal description count_of_string
## <chr> <chr> <chr> <int>
## 1 10 books to help you understand Israel a… UNSW S… "10 books … 1
## 2 100-200,000, Not Two Million': Israel's … Haaretz "100-200,0… 1
## 3 2913 Palestinian children killed in Gaza… Defens… "34 Palest… 1
## 4 3,268 Israelis evacuated to hospitals si… Anadol… "The Yedio… 1
## 5 326 Palestinian children killed as Israe… Defens… "DCIP has … 1
## 6 34 rockets fired from Lebanon at Israel … The Ti… "The barra… 1
## 7 40 Books to Understand Palestine ‹ Liter… Litera… "From Ghas… 1
## 8 A Left-vs.-Left House Battle, Funded by … The Ne… "Wesley Be… 1
## 9 A Palestinian and an Israeli physician s… The La… "As doctor… 1
## 10 A Prayer for the Israel Palestine Confli… The As… "A Prayer … 1
## # ℹ 994 more rows
# Abandoned approach, kept for reference: flag duplicated articles as
# keyword "Both" directly from the count.
# mutate(count_of_string = n) %>%
#mutate(keyword = if_else(count_of_string == 1, keyword, "Both"))
# Articles present in BOTH the palestine and israel scrapes. Duplicate
# titles on either side fan out, so the many-to-many relationship is
# declared explicitly instead of producing the warning.
inner_join_df <- tidy_pna %>%
  inner_join(tidy_ina, by = c("title", "journal", "description"),
             relationship = "many-to-many") %>%
  mutate(keyword = "Both") %>%
  select(title, journal, description, keyword)
tokenized_df <- left_join(pidf, inner_join_df,
                          by = c("title", "journal", "description"),
                          relationship = "many-to-many")
# column 5 is pidf's keyword, column 6 the "Both" marker from the join
colnames(tokenized_df)[5] <- "keyword"
colnames(tokenized_df)[6] <- "in_both"
tokenized_df <- tokenized_df %>%
  distinct() %>%
  # BUG FIX: rows with no match in inner_join_df carry NA here, and
  # `if_else(NA == "Both", T, F)` stays NA, so every later
  # `filter(in_both == FALSE)` silently dropped all articles (the
  # sentiment tables below came back empty). Map NA -> FALSE instead.
  mutate(in_both = !is.na(in_both) & in_both == "Both")
# Bing sentiment lexicon and snowball stopword list used throughout.
sentiments <- get_sentiments("bing")
stop_words <- get_stopwords()
# One row per non-stopword title token; the intact title is preserved in
# `Title` before unnest_tokens() consumes the `title` column.
tokenized_df <- tokenized_df %>%
mutate(Title = title) %>%
unnest_tokens(word, title) %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
# NOTE(review): this starts from the ALREADY title-tokenized frame, so
# each article's description tokens are repeated once per title token
# (compare 9,293 title rows vs 127,318 description rows in All_tokens
# below) — confirm this fan-out is intended.
tokenized_df2 <- tokenized_df %>%
mutate(Description = description) %>%
unnest_tokens(word, description) %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
# Tag each frame with where its tokens came from.
tokenized_df <- tokenized_df %>%
mutate(title_or_description = "title")
tokenized_df2 <- tokenized_df2 %>%
mutate(title_or_description = "description")
# Positional rename: column 3 of tokenized_df is `description`; the
# capitalized name matches tokenized_df2 for the join below.
colnames(tokenized_df)[3] ="Description"
All_tokens <- tokenized_df %>%
full_join(tokenized_df2, by = c("journal", "keyword", "in_both", "Title", "word", "Description", "title_or_description"))
# Twenty most common title tokens across the combined scrape.
tokenized_df %>%
  count(word) %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = n, y = fct_reorder(word, n))) +
  geom_col()
# data$word <- wordStem(data$word)
# Positive-sentiment title tokens unique to the "palestine" scrape.
# NOTE(review): this and the israel query below both returned ZERO rows
# (captured output kept) — `in_both` is NA, not FALSE, for articles that
# never matched inner_join_df, so `filter(in_both == F)` drops everything.
tokenized_df %>%
inner_join(sentiments) %>%
group_by(sentiment) %>%
filter(in_both == F) %>%
filter(keyword == "palestine") %>%
count(sentiment, word, sort = T) %>%
filter(sentiment == "positive")
## Joining with `by = join_by(word)`
## # A tibble: 0 × 3
## # Groups: sentiment [0]
## # ℹ 3 variables: sentiment <chr>, word <chr>, n <int>
# Same query for titles unique to the "israel" scrape.
tokenized_df %>%
inner_join(sentiments) %>%
group_by(sentiment) %>%
filter(in_both == F) %>%
filter(keyword == "israel") %>%
count(sentiment, word, sort = T) %>%
filter(sentiment == "positive")
## Joining with `by = join_by(word)`
## # A tibble: 0 × 3
## # Groups: sentiment [0]
## # ℹ 3 variables: sentiment <chr>, word <chr>, n <int>
# Per-journal share of the word "hamas" among title tokens, restricted to
# journals contributing at least 100 occurrences. Returned zero rows in
# the captured output (no journal reaches n >= 100 for "hamas").
pidf %>%
unnest_tokens(word, title) %>%
anti_join(stop_words) %>%
group_by(journal) %>%
count(word, sort = TRUE) %>%
mutate(total_words = sum(n)) %>%
mutate(word_percentage = n/total_words) %>%
#arrange(desc(word_percentage)) %>%
mutate(weighted_percentage = word_percentage * log(total_words)) %>% # Applying weighting to favor journals that contributed more scraped words
#arrange(desc(weighted_percentage)) %>%
arrange(desc(word_percentage)) %>%
#filter(!(word %in% c("palestine", "israel", "gaza")))
filter(word == "hamas") %>%
filter(n >= 100)
## Joining with `by = join_by(word)`
## # A tibble: 0 × 6
## # Groups: journal [0]
## # ℹ 6 variables: journal <chr>, word <chr>, n <int>, total_words <int>,
## # word_percentage <dbl>, weighted_percentage <dbl>
#unnest_tokens(word, description)
# Abandoned draft kept for reference (the summarize() call is not valid
# as written — it lists bare columns alongside an aggregate):
# All_tokens %>%
# group_by(word) %>%
# summarize(word_count = n(), journal, date, Description, keyword, in_both, Title, word, title_or_description) %>%
# mutate(total_words = sum(n)) %>%
# mutate(word_percentage = n/total_words) %>%
# mutate(weighted_percentage = word_percentage * log(total_words)) %>%
# arrange(desc(word_percentage)) %>%
# group_by(word)
# Counts of the tokens "palestine" and "israel" in titles, split by the
# keyword the article was scraped under.
tokenized_df %>%
  filter(word %in% c("palestine", "israel")) %>%
  count(keyword, word) %>%
  ggplot(aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Words Frequency in Title")
# Same comparison for description tokens.
tokenized_df2 %>%
  filter(word %in% c("palestine", "israel")) %>%
  count(keyword, word) %>%
  ggplot(aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Words Frequency in Description")
# Same comparison on the combined token frame, faceted by whether the
# token came from a title or a description.
All_tokens %>%
  filter(word %in% c("palestine", "israel")) %>%
  count(keyword, title_or_description, word) %>%
  ggplot(aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = 'Frequency of Words "Israel" and "Palestine"',
       x = "News Article Subject",
       y = "Frequency") +
  facet_wrap(~title_or_description, nrow = 2)
# Frequencies of "gaza", "west" and "bank", faceted by title/description.
All_tokens %>%
  group_by(keyword, title_or_description) %>%
  count(word, sort = TRUE) %>%
  filter(word %in% c("gaza", "west", "bank")) %>%
  ggplot(aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  # BUG FIX: the title was copy-pasted from the previous plot and claimed
  # to show "Israel" and "Palestine"; this panel counts gaza/west/bank.
  labs(title = 'Frequency of Words "Gaza", "West" and "Bank"') +
  xlab("News Article Subject") +
  ylab("Frequency") +
  facet_wrap(~title_or_description, nrow = 2)
# Keep the full GPT table plus one-column frames for each heading set.
gpt <- chat_gpt_article_headlines
pgpt <- chat_gpt_article_headlines %>% select(Palestine_headings)
igpt <- chat_gpt_article_headlines %>% select(Israel_headings)
# Sentiment-word counts for the GPT-generated Palestine headlines
# (captured output kept below).
pgpt %>%
unnest_tokens(word, Palestine_headings) %>%
anti_join(stop_words) %>%
inner_join(sentiments) %>%
group_by(sentiment) %>%
count(sentiment, word, sort = T)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## # A tibble: 246 × 3
## # Groups: sentiment [2]
## sentiment word n
## <chr> <chr> <int>
## 1 negative conflict 158
## 2 positive peace 90
## 3 positive innovation 73
## 4 negative crisis 67
## 5 positive sustainable 47
## 6 positive unity 37
## 7 positive wins 31
## 8 negative struggle 28
## 9 positive renewed 28
## 10 positive advocate 24
## # ℹ 236 more rows
# Same tally for the GPT-generated Israel headlines.
igpt %>%
unnest_tokens(word, Israel_headings) %>%
anti_join(stop_words) %>%
inner_join(sentiments) %>%
group_by(sentiment) %>%
count(sentiment, word, sort = T)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## # A tibble: 167 × 3
## # Groups: sentiment [2]
## sentiment word n
## <chr> <chr> <int>
## 1 negative condemns 135
## 2 positive breakthrough 96
## 3 positive innovation 65
## 4 positive peace 61
## 5 positive innovative 52
## 6 positive diplomatic 42
## 7 positive sustainable 37
## 8 negative thwart 34
## 9 positive support 33
## 10 positive boost 30
## # ℹ 157 more rows
# De-duplicated GPT headline sets.
distinct_igpt <- distinct(igpt)
distinct_pgpt <- distinct(pgpt)
# Headlines about Palestine from the one-million-headline corpus. A
# single case-insensitive regex replaces the four fixed() scans; the
# separate "palestinian" test was redundant anyway, because
# fixed("palestine") already matches it as a substring.
palestine_million <- million_news %>%
  filter(str_detect(headline_text,
                    regex("palestine|gaza|west bank", ignore_case = TRUE))) %>%
  mutate(keyword = "palestine")
# Headlines mentioning Israel.
israel_million <- million_news %>%
  filter(str_detect(headline_text,
                    regex("israel", ignore_case = TRUE))) %>%
  mutate(keyword = "israel")
#str_detect(headline_text, regex("tel aviv", ignore_case = TRUE)))
million_df <- full_join(palestine_million, israel_million,
                        by = c("publish_date", "headline_text", "keyword"))
# Tag each corpus with its provenance and align column names for joining.
additional_news_data <- additional_news_data %>%
  rename(title = headline) %>%
  mutate(source = "additional_news_data")
# Million-headline dates are yyyymmdd numbers; reformat to the
# "%d-%m-%Y" strings used by additional_news_data.
million_df <- million_df %>%
  rename(title = headline_text,
         date = publish_date) %>%
  mutate(source = "million_news",
         date = format(ymd(date), "%d-%m-%Y"))
joining <- full_join(additional_news_data, million_df,
                     by = c("title", "date", "source"))
# NOTE(review): "my_scaped_data" looks like a typo for "my_scraped_data";
# kept byte-for-byte because downstream output references this label.
pidf <- pidf %>%
  mutate(source = "my_scaped_data")
all_real_data <- pidf %>%
  #select(!date) %>%
  full_join(joining, by = c("title", "description", "keyword", "source")) %>%
  rename(date = date.y, untidy_date = date.x)
# Tokenize titles of the combined real-news frame; the intact title is
# preserved in `Title` before unnest_tokens() consumes `title`.
tokenized_df3 <- all_real_data %>%
  mutate(Title = title) %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")
# Tokenize descriptions the same way.
tokenized_df4 <- all_real_data %>%
  mutate(Description = description) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word")
tokenized_df3 <- tokenized_df3 %>%
  mutate(title_or_description = "title") %>%
  # was `colnames(tokenized_df3)[3] = "Description"` — renaming by name
  # instead of by position survives column reordering upstream
  rename(Description = description)
tokenized_df4 <- tokenized_df4 %>%
  mutate(title_or_description = "description") %>%
  # was `colnames(tokenized_df4)[1] = "Title"`
  rename(Title = title)
all_real_data_tokens <- tokenized_df3 %>%
  full_join(tokenized_df4)
## Joining with `by = join_by(journal, untidy_date, Description, keyword, source,
## date, Title, word, title_or_description)`
# Spot-check the israel-keyword tokens (captured output kept below).
all_real_data_tokens %>%
filter(!is.na(word)) %>%
filter(!is.na(keyword)) %>%
group_by(keyword) %>%
filter(keyword == "israel")
## # A tibble: 33,833 × 9
## # Groups: keyword [1]
## journal untidy_date Description keyword source date Title word
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … fbi
## 2 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … dire…
## 3 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … wray
## 4 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … makes
## 5 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … surp…
## 6 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … isra…
## 7 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … stop
## 8 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … amid
## 9 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … elev…
## 10 Fox News 22 hours ago FBI Director Christop… israel my_sc… <NA> FBI … thre…
## # ℹ 33,823 more rows
## # ℹ 1 more variable: title_or_description <chr>
# Same spot-check for the palestine keyword.
all_real_data_tokens %>%
filter(!is.na(word)) %>%
filter(!is.na(keyword)) %>%
group_by(keyword) %>%
filter(keyword == "palestine")
## # A tibble: 27,798 × 9
## # Groups: keyword [1]
## journal untidy_date Description keyword source date Title word
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 The Nation 12 hours ago Hundreds of prote… palest… my_sc… <NA> The … move…
## 2 The Nation 12 hours ago Hundreds of prote… palest… my_sc… <NA> The … pale…
## 3 The Nation 12 hours ago Hundreds of prote… palest… my_sc… <NA> The … takes
## 4 The Nation 12 hours ago Hundreds of prote… palest… my_sc… <NA> The … moma
## 5 The Guardian 1 day ago Photos displayed … palest… my_sc… <NA> Thre… three
## 6 The Guardian 1 day ago Photos displayed … palest… my_sc… <NA> Thre… guil…
## 7 The Guardian 1 day ago Photos displayed … palest… my_sc… <NA> Thre… terr…
## 8 The Guardian 1 day ago Photos displayed … palest… my_sc… <NA> Thre… offe…
## 9 The Guardian 1 day ago Photos displayed … palest… my_sc… <NA> Thre… para…
## 10 The Guardian 1 day ago Photos displayed … palest… my_sc… <NA> Thre… imag…
## # ℹ 27,788 more rows
## # ℹ 1 more variable: title_or_description <chr>
# Twenty most common tokens per keyword across all real sources.
all_real_data_tokens %>%
filter(!is.na(word)) %>%
filter(!is.na(keyword)) %>%
group_by(keyword) %>%
count(word, sort = TRUE) %>%
slice_max(n, n = 20) %>%
ggplot(aes(n, fct_reorder(word, n), fill = keyword)) +
geom_col()
# Pull the four-digit year out of the "%d-%m-%Y" formatted date strings.
# `all_real_data` keeps it as character; the token frame gets a numeric
# year for plotting on a continuous axis.
all_real_data <- all_real_data %>%
  mutate(year = str_sub(date, 7, 10))
all_real_data_tokens <- all_real_data_tokens %>%
  mutate(year = as.numeric(str_sub(date, 7, 10)))
# Token volume per year; rows with NA year trigger ggplot's
# removed-missing-values warning.
all_real_data_tokens %>%
  count(year) %>%
  ggplot(aes(x = year, y = n)) +
  geom_col() +
  scale_x_continuous(breaks = seq(2003, 2023, by = 1))
# Yearly share of the token "palestine" across all real sources, with
# reference lines at 2009 and 2017 (see discussion further down).
all_real_data_tokens %>%
  group_by(year) %>%
  #filter(source == "million_news")
  filter(!is.na(year), !is.na(word)) %>%
  count(word, sort = TRUE) %>%
  mutate(total_words = sum(n),
         word_percentage = n / total_words) %>%
  filter(word == "palestine") %>%
  ggplot(aes(x = year, y = word_percentage)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2003, 2023, by = 2)) +
  geom_vline(xintercept = 2009, colour = "red") +
  geom_vline(xintercept = 2017, colour = "red") +
  # BUG FIX: was `labs(title = )` — an empty argument that errors when
  # the plot is rendered; a real title is supplied instead.
  labs(title = 'Yearly Share of the Word "Palestine"')
# Yearly share of the token "palestine" within each scrape keyword.
all_real_data_tokens %>%
  filter(!is.na(keyword)) %>%
  group_by(year, keyword) %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(word, sort = TRUE) %>%
  mutate(total_words = sum(n)) %>%
  # the word_percentage mutate was duplicated verbatim; computed once now
  mutate(word_percentage = n/total_words) %>%
  filter(word %in% c("palestine")) %>%
  ggplot(aes(x = year, y = word_percentage, colour = keyword)) +
  geom_line()+
  #geom_smooth()+
  scale_x_continuous(breaks = seq(2003, 2024, by = 1))
# Same yearly share of "palestine" as above, with a full title and axis
# labels. Red guides mark 2009 and 2017 (discussed below).
all_real_data_tokens %>%
  group_by(year) %>%
  #filter(source == "million_news") %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(word, sort = TRUE) %>%
  mutate(total_words = sum(n),
         word_percentage = n / total_words) %>%
  filter(word == "palestine") %>%
  ggplot(aes(x = year, y = word_percentage)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2003, 2024, by = 1)) +
  geom_vline(xintercept = 2009, colour = "red") +
  geom_vline(xintercept = 2017, colour = "red") +
  labs(title = 'Percentage of the Word "Palestine" in Palestine/Israel News Article Headings Over Time',
       x = "Year",
       y = "Word Percentage")
# It appears that between 2009 and 2011 there is a large increase in the use of the word "palestine", particularly in news articles about Palestine. Why?
# 2009-2010: Settlement Freeze. U.S. President Barack Obama attempted to revive Israeli-Palestinian peace talks shortly after taking office in 2009. At a speech at Cairo University that year, Obama reiterated his support for a two-state solution. Why it matters: as part of a good-faith gesture, Israeli Prime Minister Benjamin Netanyahu implemented a settlement freeze, a key Palestinian demand, that lasted 10 months. While talks briefly restarted, Palestinian Authority President Mahmoud Abbas aborted the talks.
# "No American president ever came into office with a better understanding of the tragic history of the Palestinians or a deeper commitment to help them achieve independence than Obama. In his Cairo speech in April 2009, Obama solemnly pledged to do everything in his power to bring about Palestinian statehood." - Al Jazeera
# Trump was elected in 2016 and took office in 2017.
# The 2013-2014 Israeli-Palestinian peace talks were part of the Israeli-Palestinian peace process. Direct negotiations between Israel and the Palestinians began on 29 July 2013 following an attempt by United States Secretary of State John Kerry to restart the peace process.
# Yearly share of the token "terrorism" with a loess trend overlaid.
all_real_data_tokens %>%
  group_by(year) %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(word, sort = TRUE) %>%
  mutate(total_words = sum(n),
         word_percentage = n / total_words) %>%
  filter(word == "terrorism") %>%
  ggplot(aes(x = year, y = word_percentage)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2003, 2023, by = 1)) +
  geom_vline(xintercept = 2009, colour = "red") +
  geom_vline(xintercept = 2017, colour = "red") +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Yearly share of "terrorism", split by scrape keyword.
all_real_data_tokens %>%
  group_by(year, keyword) %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(word, sort = TRUE) %>%
  mutate(total_words = sum(n),
         word_percentage = n / total_words) %>%
  filter(word == "terrorism") %>%
  ggplot(aes(x = year, y = word_percentage, colour = keyword)) +
  geom_point() +
  #geom_smooth() +
  scale_x_continuous(breaks = seq(2003, 2023, by = 1))
# Title-derived tokens only (captured output kept below). Note the count:
# 9,293 title rows vs 127,318 description rows — see the fan-out note
# where tokenized_df2 is built.
All_tokens %>%
filter(title_or_description == "title")
## # A tibble: 9,293 × 9
## journal date.x Description keyword in_both Title word title_or_description
## <chr> <chr> <chr> <chr> <lgl> <chr> <chr> <chr>
## 1 The Nati… 12 ho… Hundreds o… palest… NA The … move… title
## 2 The Nati… 12 ho… Hundreds o… palest… NA The … pale… title
## 3 The Nati… 12 ho… Hundreds o… palest… NA The … takes title
## 4 The Nati… 12 ho… Hundreds o… palest… NA The … moma title
## 5 The Guar… 1 day… Photos dis… palest… NA Thre… three title
## 6 The Guar… 1 day… Photos dis… palest… NA Thre… guil… title
## 7 The Guar… 1 day… Photos dis… palest… NA Thre… terr… title
## 8 The Guar… 1 day… Photos dis… palest… NA Thre… offe… title
## 9 The Guar… 1 day… Photos dis… palest… NA Thre… para… title
## 10 The Guar… 1 day… Photos dis… palest… NA Thre… imag… title
## # ℹ 9,283 more rows
## # ℹ 1 more variable: date.y <chr>
# Description-derived tokens.
All_tokens %>%
filter(title_or_description == "description")
## # A tibble: 127,318 × 9
## journal date.x Description keyword in_both Title word title_or_description
## <chr> <chr> <chr> <chr> <lgl> <chr> <chr> <chr>
## 1 The Nati… <NA> Hundreds o… palest… NA The … hund… description
## 2 The Nati… <NA> Hundreds o… palest… NA The … prot… description
## 3 The Nati… <NA> Hundreds o… palest… NA The … occu… description
## 4 The Nati… <NA> Hundreds o… palest… NA The … stor… description
## 5 The Nati… <NA> Hundreds o… palest… NA The … muse… description
## 6 The Nati… <NA> Hundreds o… palest… NA The … remi… description
## 7 The Nati… <NA> Hundreds o… palest… NA The … thea… description
## 8 The Nati… <NA> Hundreds o… palest… NA The … world description
## 9 The Nati… <NA> Hundreds o… palest… NA The … evade description
## 10 The Nati… <NA> Hundreds o… palest… NA The … poli… description
## # ℹ 127,308 more rows
## # ℹ 1 more variable: date.y <chr>
# NOTE(review): both aesthetics map to `n`, so every point falls on the
# y = x diagonal — presumably one axis was meant to carry a different
# measure (the pivoted title-vs-description version follows). Confirm
# intent; left as-is.
All_tokens%>%
filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
group_by(keyword, title_or_description) %>%
count(word, sort = TRUE) %>%
ggplot(aes(x = n, y = n, colour = keyword)) +
geom_point()
# Pivot per-word counts wide so title and description frequencies can be
# plotted against each other; words absent from one side get 0.
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  count(keyword, title_or_description, word) %>%
  pivot_wider(names_from = title_or_description, values_from = n, values_fill = 0) %>%
  ggplot(aes(x = title, y = description)) +
  geom_point()
# NOTE(review): replacing "" with "" is a no-op. The pattern was probably
# a non-printing character (e.g. a zero-width space) that was lost when
# this script was exported — confirm and restore the intended pattern.
All_tokens$word <- str_replace_all(All_tokens$word, "", "")
# Words that are common (> 0.45% of tokens) in BOTH titles and
# descriptions, plotted as relative frequencies with text labels.
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  group_by(title_or_description) %>%
  count(word, sort = TRUE) %>%
  ungroup() %>%
  pivot_wider(names_from = title_or_description, values_from = n, values_fill = 0) %>%
  mutate(total_description = sum(description),
         total_title = sum(title),
         title_percentage = title/total_title,
         description_percentage = description/total_description) %>%
  filter(title_percentage > 0.0045, description_percentage > 0.0045) %>%
  ggplot(aes(x = title_percentage, y = description_percentage, label = word)) +
  geom_point() +
  geom_text(hjust = -0.1, vjust = -0.5, size = 3) + # nudge labels off the points
  # BUG FIX: axes plot shares, not raw counts, and "Compairison" was a typo.
  labs(x = "Word Percentage in Title", y = "Word Percentage in Description",
       title = "Word Frequency Comparison") +
  theme_minimal()
# Title share vs description share for every word, with a linear fit,
# its R², and a red y = x reference (a word used equally often in titles
# and descriptions lies on the red line).
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  group_by(title_or_description) %>%
  count(word, sort = TRUE) %>%
  ungroup() %>%
  pivot_wider(names_from = title_or_description, values_from = n, values_fill = 0) %>%
  mutate(total_description = sum(description),
         total_title = sum(title),
         title_percentage = title/total_title,
         description_percentage = description/total_description) %>%
  filter(title_percentage > 0, description_percentage > 0) %>%
  # BUG FIX: `label = word` was mapped in the main aes() but no text geom
  # used it, which made stat_poly_eq() drop the aesthetic with a warning;
  # removed. Axis labels fixed (shares, not counts) and "Compairison" typo.
  ggplot(aes(x = title_percentage, y = description_percentage)) +
  geom_point(alpha = 0.5) +
  labs(x = "Word Percentage in Title", y = "Word Percentage in Description",
       title = "Word Frequency Comparison") +
  theme_minimal() +
  geom_smooth(method = "lm") +
  stat_poly_eq(formula = y ~ x,
               aes(label = paste(after_stat(rr.label)))) +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  scale_x_continuous(breaks = seq(0, 0.06, by = 0.01)) +
  scale_y_continuous(breaks = seq(0, 0.06, by = 0.01)) +
  coord_cartesian(xlim = c(0, 0.06), ylim = c(0, 0.06))
# Interactive version of the share-vs-share scatter; hovering a marker
# reveals the word.
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  group_by(title_or_description) %>%
  count(word, sort = TRUE) %>%
  ungroup() %>%
  pivot_wider(names_from = title_or_description, values_from = n, values_fill = 0) %>%
  mutate(total_description = sum(description),
         total_title = sum(title),
         title_percentage = title / total_title,
         description_percentage = description / total_description) %>%
  filter(title_percentage > 0, description_percentage > 0) %>%
  plot_ly(x = ~title_percentage, y = ~description_percentage, text = ~word) %>%
  add_markers()
# Recover a publication year from the free-text `untidy_date` strings
# ("22 hours ago", "Dec 7, 2023", "2023-10-24", ...). The original
# spelled out one case_when branch per year from 2023 down to 2000;
# extracting the first 20xx run is equivalent for those inputs and also
# covers later years. Fallbacks preserved: no recognisable year -> 2024
# (relative dates like "22 hours ago"), NA input -> NA.
# NOTE(review): if a string ever contained two different years, the old
# code picked the most recent one while this picks the first occurrence —
# confirm no such strings exist.
all_real_data_tokens <- all_real_data_tokens %>%
  mutate(fixed_date = as.numeric(str_extract(untidy_date, "20[0-2][0-9]")),
         fixed_date = case_when(
           is.na(untidy_date) ~ NA,
           is.na(fixed_date) ~ 2024,
           .default = fixed_date
         )) %>%
  # fill the year column only where the formatted date gave nothing
  mutate(year = if_else(is.na(year), fixed_date, year))
# Year lives in characters 7-10 of the "%d-%m-%Y" date strings.
additional_news_data <- additional_news_data %>%
  mutate(year = str_sub(date, 7, 10))
# Rows from additional_news_data carry no scrape keyword (captured
# output kept below).
all_real_data_tokens %>%
filter(is.na(keyword))
## # A tibble: 64,662 × 11
## journal untidy_date Description keyword source date Title word
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> <NA> Gaza’s journalists ar… <NA> additio… 23-1… Gaza… gaza…
## 2 <NA> <NA> Gaza’s journalists ar… <NA> additio… 23-1… Gaza… jour…
## 3 <NA> <NA> Gaza’s journalists ar… <NA> additio… 23-1… Gaza… tar…
## 4 <NA> <NA> Gaza’s journalists ar… <NA> additio… 23-1… Gaza… cas…
## 5 <NA> <NA> Gaza’s journalists ar… <NA> additio… 23-1… Gaza… isr…
## 6 <NA> <NA> Gaza’s journalists ar… <NA> additio… 23-1… Gaza… war
## 7 <NA> <NA> Hundreds of Palestini… <NA> additio… 23-1… Isr… isr…
## 8 <NA> <NA> Hundreds of Palestini… <NA> additio… 23-1… Isr… ord…
## 9 <NA> <NA> Hundreds of Palestini… <NA> additio… 23-1… Isr… death
## 10 <NA> <NA> Hundreds of Palestini… <NA> additio… 23-1… Isr… cor…
## # ℹ 64,652 more rows
## # ℹ 3 more variables: title_or_description <chr>, year <dbl>, fixed_date <dbl>
# Sanity check: 2023 rows containing the token "israel" (str_squish
# guards against stray whitespace in the token).
all_real_data_tokens %>%
filter(year == 2023) %>%
mutate(word = str_squish(word)) %>%
filter(word == "israel")
## # A tibble: 465 × 11
## journal untidy_date Description keyword source date Title word
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 EEAS 23.10.2023 "The EU's … palest… my_sc… <NA> Isra… isra…
## 2 The Economist Dec 7, 2023 "IF YOU WA… palest… my_sc… <NA> Isra… isra…
## 3 Carnegie Endowment … Nov 30, 20… "Navigatin… palest… my_sc… <NA> Navi… isra…
## 4 UN News 2023-10-24 "On day 17… palest… my_sc… <NA> Isra… isra…
## 5 UN News 2023-10-29 "The crisi… palest… my_sc… <NA> Isra… isra…
## 6 Anadolu Ajansı 2023-11-24 "BBC repor… palest… my_sc… <NA> BBC … isra…
## 7 The Conversation 17 Oct 2023 "The joint… palest… my_sc… <NA> How … isra…
## 8 Human Rights Watch 26 Oct 2023 "Responses… palest… my_sc… <NA> Isra… isra…
## 9 Al Jazeera 27 Nov 2023 "In 1947, … palest… my_sc… <NA> Isra… isra…
## 10 GOV.UK 27 Sept 20… "The UK ca… palest… my_sc… <NA> The … isra…
## # ℹ 455 more rows
## # ℹ 3 more variables: title_or_description <chr>, year <dbl>, fixed_date <dbl>
# all_real_data_tokens$word <- gsub("[[:space:]]", "", all_real_data_tokens$word)
#
# NOTE(review): replacing "" with "" is a no-op. Together with the
# commented gsub above, this was presumably meant to strip whitespace or
# an invisible character (e.g. a zero-width space) from tokens, and the
# pattern was lost when the script was exported — confirm and restore it.
all_real_data_tokens$word <- str_replace_all(all_real_data_tokens$word, "", "")